library(sf)
library(dplyr)
library(raster)
library(rgdal)
library(sp)
library(terra)
library(dplyr)
library(tidyr)
library(purrr)

# pops2000 <- st_read("data_raw/gpw-v4-population-count-rev11_2000_30_min_tif/gpw_v4_population_count_rev11_2000_30_min.tif")
# pops2000 <- raster::stack("data_raw/gpw-v4-population-count-rev11_2000_30_min_tif/gpw_v4_population_count_rev11_2000_30_min.tif")

pops2000 <- terra::rast("data_raw/gpw-v4-population-count-rev11_2000_30_min_tif/gpw_v4_population_count_rev11_2000_30_min.tif")


geo_epr <- read.csv("C:/Users/Frederik Gremler/Documents/paper_adoption/data/GeoEPR-2019.csv")

geo_epr <- geo_epr %>% filter(the_geom != "")

# first, I need to convert EWKB col to WKB
geo_epr$the_geom <- st_as_sfc(geo_epr$the_geom) 

# the GeoEPR csv format uses well known text
geo_epr <- st_as_sf(geo_epr, sf_column_name = "the_geom")

# expand years
geo_epr_year <- geo_epr %>% mutate(year = map2(geo_epr$from, geo_epr$to, seq)) %>% 
  unnest(year)

# only year 2000 - this is the year where we have the population data
# this of course loses groups that were not politically relevant in 2000
# but this should only be a small number
geo_epr_year <- geo_epr_year %>% dplyr::filter(year == 2000)

# HHI based on sqkm of polygons
sf_use_s2(FALSE) # s. here: https://github.com/r-spatial/sf/issues/1762

# explode the geometries, preserving ids
geo_epr_exploded <- st_cast(geo_epr_year, "POLYGON")

geo_epr_exploded_terra <- terra::vect(geo_epr_exploded)

# extract raster for polygons
pop_per_pol <- terra::extract(pops2000, geo_epr_exploded_terra, 'sum')
geo_epr_exploded$population <- pop_per_pol$gpw_v4_population_count_rev11_2000_30_min

# calculate HHI

# calculate the shares
geo_epr_imploded <- geo_epr_exploded %>% 
  group_by(gwgroupid) %>%
  mutate(total_group_population = sum(population, na.rm = T)) %>%
  # na.rm = T leads to sum(NaN, na.rm =T) == 0, i remove these cases
  ungroup() %>%
  filter(total_group_population != 0) %>%
  mutate(population_share = population/total_group_population)

# square each polygon's population share
geo_epr_imploded$population_share_sq <- geo_epr_imploded$population_share ^2

# sum the squared shares, substract from 1 for right direction of coefs
geo_epr_imploded <- geo_epr_imploded %>%
  dplyr::group_by(gwgroupid) %>%
  dplyr::summarize(spatial_hhi = 1 -sum(population_share_sq, na.rm = T),
            n_polygons = n()) 


# save for join in data file
write_csv(geo_epr_imploded, "data/hhi.csv")

############
### check
############

# check whether this conforms to the summed values in epr
# make populations per group
group_populations <- geo_epr_exploded %>%
                      dplyr::group_by(gwgroupid) %>%
                      dplyr::summarize(summed_pop = sum(population, na.rm = TRUE))

# merge back to geo-epr data
geo_epr_year <- left_join(geo_epr_year, group_populations %>% as.data.frame() %>%  dplyr::select(gwgroupid, summed_pop), by = c("gwgroupid"))

epr <- read_csv(file.path("data_raw", "EPR_merged.csv"))

# checks
geo_epr_year <- left_join(geo_epr_year, epr %>% filter(year == 2000) %>% select(gwgroupid, pop00_total), by = "gwgroupid")
cor(geo_epr_year$pop00_total, geo_epr_year$summed_pop, use = "complete.obs")
# this is basically 1, so it works (difference probably due to rounding errors)

